import pandas as pd
import re
import json


def extract_chinese_word(text):
    """Return only the Chinese characters of *text*, joined in original order.

    Every run of characters in the range U+4E00..U+9FA5 is kept; all other
    characters (Latin letters, digits, punctuation, whitespace, ...) are
    dropped. An empty string is returned when *text* has no such characters.
    """
    han_runs = re.findall('[\u4E00-\u9FA5]+', text)
    return "".join(han_runs)


# Build a {file_name: chinese_only_sentence} mapping from the tab-separated
# transcript file and dump it as pretty-printed JSON.
test_dict = {}
with open('/home/asr_deploy_test/asr_test_file/dataset/test/content.txt', 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of loading every line up front.
    for line_no, line in enumerate(f, start=1):
        fields = line.split('\t')
        if len(fields) < 2:
            if not line.strip():
                # Blank line (e.g. a trailing empty line): nothing to record.
                continue
            raise ValueError(
                "content.txt line {}: expected '<file_name>\\t<sentence>', got {!r}".format(
                    line_no, line
                )
            )
        # Only the first two columns are used; any further columns are ignored.
        file_name, sentence = fields[0], fields[1]
        test_dict[file_name] = extract_chinese_word(sentence)

# ensure_ascii=False keeps the Chinese text readable in the output file.
json_str = json.dumps(test_dict, ensure_ascii=False, indent=4)
with open('dataset/test_set.json', 'w', encoding='utf-8') as fp:
    # The context manager closes the file; no explicit close() is needed.
    fp.write(json_str)

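# NOTE(review): the commented-out draft below is disabled. It references names
# that are not defined in the visible part of this file (`os`, `test_path`,
# `test_sentence_path`) and passes a list of dicts to `writelines`, which
# requires strings, so it cannot simply be uncommented as-is.
# file_dict = {}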
#     for dir in os.listdir(test_path):
#         files_path = os.path.join(test_path, dir)
#         for file in os.listdir(files_path):
#             path = os.path.join(files_path, file)
#             file_dict[file] = path
#
#     with open(test_sentence_path, 'r', encoding='utf-8') as fp:
#         answer = json.load(fp)
#
#     batch_list = []
#
#     for file in file_dict:
#         if file in answer:
#             batch = {"path": file_dict[file], "sentence": answer[file]}
#             batch_list.append(batch)
# with open('dataset/batch_list.txt', 'w', encoding='utf-8') as f:
#     f.writelines(batch_list)
